{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "c93d5c47",
      "metadata": {},
      "source": [
        "# 导入"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "id": "da0d3900",
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "d:\\softwares\\python\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
            "  from .autonotebook import tqdm as notebook_tqdm\n"
          ]
        }
      ],
      "source": [
        "from sentence_transformers import SentenceTransformer\n",
        "import numpy as np"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "27a9041f",
      "metadata": {},
      "source": [
        "# 加载文件"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "id": "fedbdc84-c52f-4552-8945-ea17ec0bbd85",
      "metadata": {
        "tags": []
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "文本条数:  100\n",
            "预览第一条:  【#文旅文创看洛阳# 】2023#河南省文旅文创发展大会# 本次大会安排了项目签约，主要有两方面内容。一是文旅产业项目签约。截至目前，共梳理41个重点文旅项目，投资总额525.6亿元；遴选21个重大项目进行现场签约，投资总额365.8亿元。这些项目既包括文物数字化开发、文化创意园区建设等文化类项目，也涵盖了旅游度假区建设、旅游酒店民宿打造等旅游类项目，既有旅游景区开发、文商旅综合体建设等传统业态项目，也有元宇宙基地、沉浸式演艺等新业态项目，充分体现了我省文化旅游发展的特点和趋势。二是引客入豫项目签约。主要是我省文旅部门、文旅企业与头部旅行商、知名OTA平台、重点客源地文旅部门等签订引客入豫协议等，持续拓展省外客源市场。\n",
            "\n"
          ]
        }
      ],
      "source": [
        "# Step 1: load the corpus, one document per line.\n",
        "DATA_PATH = '../data/文本.txt'\n",
        "MAX_DOCS = None  # set to e.g. 100 for a quick test run; None loads every line\n",
        "\n",
        "with open(DATA_PATH, 'r', encoding='utf-8') as file:\n",
        "  # Drop only the trailing newline; blank lines are kept so that row i of the\n",
        "  # embeddings still corresponds to line i of the file.\n",
        "  docs = [line.rstrip('\\n') for line in file]\n",
        "if MAX_DOCS is not None:\n",
        "  docs = docs[:MAX_DOCS]\n",
        "# Fail with a clear message instead of an IndexError on docs[0] below.\n",
        "assert docs, f'no documents found in {DATA_PATH}'\n",
        "print('文本条数: ', len(docs))\n",
        "print('预览第一条: ', docs[0])"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "45994fa9",
      "metadata": {},
      "source": [
        "# 生成句向量"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "id": "80e65b4d-6713-4e3c-acf0-f989a5db8e35",
      "metadata": {
        "tags": []
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Batches: 100%|██████████| 4/4 [00:07<00:00,  1.87s/it]"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "<class 'numpy.ndarray'> (100, 384)\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        }
      ],
      "source": [
        "# Load the pretrained sentence-transformers checkpoint identified by name.\n",
        "model_name = 'paraphrase-multilingual-MiniLM-L12-v2'\n",
        "embedding_model = SentenceTransformer(model_name)\n",
        "\n",
        "# Encode every entry of `docs` (progress bar enabled), then report the\n",
        "# container type and shape of the result.\n",
        "embeddings = embedding_model.encode(docs, show_progress_bar=True)\n",
        "print(f'{type(embeddings)} {embeddings.shape}')"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "7d965cf2",
      "metadata": {},
      "source": [
        "# 保存"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "id": "f78d7f2a-a39b-4dd5-a83e-04fbf721682e",
      "metadata": {
        "tags": []
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "<class 'numpy.ndarray'> (100, 384)\n"
          ]
        }
      ],
      "source": [
        "# Write the embedding array to disk, then read it back from the same file\n",
        "# and print its type and shape as a quick sanity check.\n",
        "emb_path = 'emb.npy'\n",
        "np.save(emb_path, embeddings)\n",
        "embeddings = np.load(emb_path)\n",
        "print(f'{type(embeddings)} {embeddings.shape}')"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.6"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
